#Pre-processing and Cleaning Data
# NOTE(review): initial.cbb.data is assumed to have been read from the csv in an earlier
# chunk -- it is not created anywhere in this file; confirm against the full document.
#Calling head() on the data frame initial.cbb.data to observe the layout
head(initial.cbb.data)
## Rank Team Wins AdjEM AdjO AdjD AdjT Luck AdjEM.1 OppO
## 1 1 Virginia\xca 35 34.22 123.4 89.2 59.4 0.050 11.18 109.2
## 2 2 Gonzaga\xca 33 32.85 124.5 91.6 70.2 -0.001 4.46 106.9
## 3 3 Michigan St.\xca 32 30.81 121.0 90.2 66.9 0.001 13.67 110.6
## 4 4 Duke\xca 32 30.62 120.0 89.3 72.1 0.018 12.85 110.7
## 5 5 Texas Tech\xca 31 30.03 114.1 84.1 66.6 0.004 11.18 109.8
## 6 6 Michigan\xca 30 28.32 114.5 86.2 64.8 -0.001 11.27 109.2
## OppD AdjEM.2 PtsPerGame
## 1 98.1 -3.24 71.8
## 2 102.5 1.87 88.8
## 3 96.9 3.24 79.2
## 4 97.8 5.08 83.5
## 5 98.7 -5.39 73.1
## 6 98.0 -5.42 70.7
# After calling the data frame initial.cbb.data there is an observed special character
# attached to the end of each Team name, that was included from excel which we used to
# create our csv file, I am going to remove the team name category. All teams have a ranking
# which corresponds to the index they are stored at. We can store team names as a separate
# vector which will maintain the index.
teamNames <- initial.cbb.data$Team
teamNames #call to make sure the right data was stored in teamNames variable
## [1] Virginia\xca Gonzaga\xca Michigan St.\xca
## [4] Duke\xca Texas Tech\xca Michigan\xca
## [7] North Carolina\xca Kentucky\xca Purdue\xca
## [10] Tennessee\xca Auburn\xca Houston\xca
## [13] Virginia Tech\xca Florida St.\xca Iowa St.\xca
## [16] Wisconsin\xca Kansas\xca Wofford\xca
## [19] LSU\xca Kansas St.\xca Mississippi St.\xca
## [22] Buffalo\xca Louisville\xca Maryland\xca
## [25] Texas\xca Florida\xca Nevada\xca
## [28] Oregon\xca Cincinnati\xca Villanova\xca
## [31] Saint Mary's\xca Oklahoma\xca Marquette\xca
## [34] UCF\xca Baylor\xca Clemson\xca
## [37] Iowa\xca Utah St.\xca Syracuse\xca
## [40] TCU\xca N.C. State\xca VCU\xca
## [43] Penn St. Ohio St.\xca Lipscomb\xca
## [46] Minnesota\xca Nebraska\xca Washington\xca
## [49] Belmont\xca Mississippi\xca Murray St.\xca
## [52] Indiana\xca New Mexico St.\xca Arkansas\xca
## [55] Creighton\xca Memphis\xca Arizona St.\xca
## [58] Liberty\xca Furman\xca Seton Hall\xca
## [61] Toledo\xca Dayton\xca Colorado\xca
## [64] Alabama\xca Xavier\xca Wichita St.\xca
## [67] San Francisco Missouri Temple\xca
## [70] South Carolina Fresno St. Butler\xca
## [73] UC Irvine\xca Northwestern Miami FL
## [76] Vermont\xca Yale\xca Rutgers
## [79] Providence\xca East Tennessee St. Oregon St.
## [82] USC Oklahoma St. Illinois
## [85] Davidson\xca BYU UNC Greensboro\xca
## [88] St. John's\xca Northeastern\xca San Diego\xca
## [91] Texas A&M South Dakota St.\xca Hofstra\xca
## [94] Arizona West Virginia Northern Kentucky\xca
## [97] Notre Dame Connecticut South Florida
## [100] Georgetown\xca
## 100 Levels: Alabama\xca Arizona Arizona St.\xca Arkansas\xca ... Yale\xca
# NOTE(review): the "100 Levels:" line shows Team was read in as a FACTOR, and the trailing
# "\xca" is a stray non-ASCII byte (presumably a Mac-Roman non-breaking space from Excel --
# confirm the csv encoding). Both matter when cleaning the names below.
# Next we will remove the special excel character attached to the end of each team name.
# NOTE(review): the original used str_extract_all(teamNames, "[A-Z a-z]+"), which has two
# defects: (1) str_extract_all() returns a LIST of all matches, not a character vector;
# (2) the pattern drops punctuation, so "Michigan St.", "Saint Mary's", "N.C. State" and
# "Texas A&M" are split into fragments. Deleting only the stray "\xca" byte keeps every
# name intact and yields a plain character vector (as.character also drops the factor).
teamNames <- str_replace_all(as.character(teamNames), "\xca", "")
#Now that teamNames is stored in a separate vector we will remove teamNames from the data set
# Drop columns by NAME rather than by position -- positional indexing ([,-2]) silently
# removes the wrong column if the csv layout ever changes.
initial.cbb.data <- initial.cbb.data[, names(initial.cbb.data) != "Team"]
head(initial.cbb.data)
## Rank Wins AdjEM AdjO AdjD AdjT Luck AdjEM.1 OppO OppD AdjEM.2
## 1 1 35 34.22 123.4 89.2 59.4 0.050 11.18 109.2 98.1 -3.24
## 2 2 33 32.85 124.5 91.6 70.2 -0.001 4.46 106.9 102.5 1.87
## 3 3 32 30.81 121.0 90.2 66.9 0.001 13.67 110.6 96.9 3.24
## 4 4 32 30.62 120.0 89.3 72.1 0.018 12.85 110.7 97.8 5.08
## 5 5 31 30.03 114.1 84.1 66.6 0.004 11.18 109.8 98.7 -5.39
## 6 6 30 28.32 114.5 86.2 64.8 -0.001 11.27 109.2 98.0 -5.42
## PtsPerGame
## 1 71.8
## 2 88.8
## 3 79.2
## 4 83.5
## 5 73.1
## 6 70.7
# Also since each team ranking is synonymous with its index we will also remove the rank column
teamRank <- initial.cbb.data$Rank
head(teamRank) #integer values 1 through 100
## [1] 1 2 3 4 5 6
initial.cbb.data <- initial.cbb.data[, names(initial.cbb.data) != "Rank"]
head(initial.cbb.data)
## Wins AdjEM AdjO AdjD AdjT Luck AdjEM.1 OppO OppD AdjEM.2 PtsPerGame
## 1 35 34.22 123.4 89.2 59.4 0.050 11.18 109.2 98.1 -3.24 71.8
## 2 33 32.85 124.5 91.6 70.2 -0.001 4.46 106.9 102.5 1.87 88.8
## 3 32 30.81 121.0 90.2 66.9 0.001 13.67 110.6 96.9 3.24 79.2
## 4 32 30.62 120.0 89.3 72.1 0.018 12.85 110.7 97.8 5.08 83.5
## 5 31 30.03 114.1 84.1 66.6 0.004 11.18 109.8 98.7 -5.39 73.1
## 6 30 28.32 114.5 86.2 64.8 -0.001 11.27 109.2 98.0 -5.42 70.7
# Check for missing values before modeling
sum(is.na(initial.cbb.data))
## [1] 0
##The sum is 0, meaning that there are no NA's in the dataset
#In our model our response is going to be total number of wins
teamWins <- initial.cbb.data$Wins
# Now that teamWins is stored as a vector, we can remove it from the dataset
cleaned.cbb.data <- initial.cbb.data[, names(initial.cbb.data) != "Wins"]
head(cleaned.cbb.data) # Double check to make sure that wins is removed from the dataset
## AdjEM AdjO AdjD AdjT Luck AdjEM.1 OppO OppD AdjEM.2 PtsPerGame
## 1 34.22 123.4 89.2 59.4 0.050 11.18 109.2 98.1 -3.24 71.8
## 2 32.85 124.5 91.6 70.2 -0.001 4.46 106.9 102.5 1.87 88.8
## 3 30.81 121.0 90.2 66.9 0.001 13.67 110.6 96.9 3.24 79.2
## 4 30.62 120.0 89.3 72.1 0.018 12.85 110.7 97.8 5.08 83.5
## 5 30.03 114.1 84.1 66.6 0.004 11.18 109.8 98.7 -5.39 73.1
## 6 28.32 114.5 86.2 64.8 -0.001 11.27 109.2 98.0 -5.42 70.7
# We calculate the coefficients of variation by dividing the sample standard deviation by the
# sample mean. This will tell us the variations that happens for each feature in the dataset
# we will get rid of any features that have variance < 0.05
#all of the features in cleaned.cbb.data is of the numeric type
#Calculate sample means for each feature
column.sample.mean <- colMeans(cleaned.cbb.data)
column.sample.mean
## AdjEM AdjO AdjD AdjT Luck AdjEM.1
## 14.77760 111.82700 97.04700 67.51200 0.00367 6.20380
## OppO OppD AdjEM.2 PtsPerGame
## 107.47700 101.27500 -0.21880 75.05400
#Calculate sample standard deviations for each feature
# apply(..., 2, sd) computes sd column-wise; all columns are numeric so the matrix
# coercion done by apply() is harmless here
column.sample.std <- apply(cleaned.cbb.data,2,sd)
column.sample.std
## AdjEM AdjO AdjD AdjT Luck AdjEM.1
## 6.87334996 4.51410483 4.33546463 2.68171060 0.04699711 5.34770484
## OppO OppD AdjEM.2 PtsPerGame
## 2.52713976 2.93217522 4.09242767 4.93049509
#Calculate Coefficients of Variations
# NOTE(review): CV = sd/mean is unstable when the mean is near zero (Luck, mean ~0.004,
# yields CV ~12.8) and it carries the SIGN of the mean (AdjEM.2, negative mean, yields
# CV ~ -18.7). The threshold-based screening downstream should be interpreted with care;
# sd/abs(mean) would be the more defensible statistic -- confirm before reusing.
coef.variation <- column.sample.std/column.sample.mean
coef.variation
## AdjEM AdjO AdjD AdjT Luck
## 0.46511950 0.04036686 0.04467387 0.03972198 12.80575215
## AdjEM.1 OppO OppD AdjEM.2 PtsPerGame
## 0.86200471 0.02351331 0.02895261 -18.70396557 0.06569264
#Next we call summary of coef.variation to observe
summary(coef.variation)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -18.70397 0.03164 0.04252 -0.43282 0.36526 12.80575
#Plotting coefficient of variation against the median value, we see that there are only
# two features which have enough variance that would be ideal to keep: luck and AdjEM.1
# However, when we create a model with just those two variables our multiple R-squared value
# falls from 0.94 to 0.33, so we are losing a lot of explained data.
# plot() and abline() are separate statements: "plot(...) + abline(...)" adds their NULL
# return values, which is meaningless (the original wrapped it in invisible() to hide
# the resulting integer(0) echo).
plot(coef.variation, main = 'Figure 1: Coefficient of Variation')
abline(h = median(coef.variation), col = 'blue')
abline(h = 0, col = 'red')
# Initially our strategy was to remove any features that have a coefficient of variation
# less than 0.05, however our median variation is 0.4252, so instead we will remove any features
# that are less than or equal to the 1st quartile coefficient of variation.
# Compute the quartile instead of hard-coding the rounded 0.03164 from the summary above.
cv.first.quartile <- quantile(coef.variation, 0.25)
featuresToKeep <- which(coef.variation > cv.first.quartile)
length(featuresToKeep)
## [1] 7
featuresToDrop <- which(coef.variation <= cv.first.quartile)
length(featuresToDrop)
## [1] 3
# We are removing three variables from our dataset (OppO, OppD, AdjEM.2), which is fine because
# the information contained in those variables are used in the variables we are keeping
cleaned.cbb.data <- cleaned.cbb.data[,featuresToKeep]
head(cleaned.cbb.data) # contains the 7 variables we want
## AdjEM AdjO AdjD AdjT Luck AdjEM.1 PtsPerGame
## 1 34.22 123.4 89.2 59.4 0.050 11.18 71.8
## 2 32.85 124.5 91.6 70.2 -0.001 4.46 88.8
## 3 30.81 121.0 90.2 66.9 0.001 13.67 79.2
## 4 30.62 120.0 89.3 72.1 0.018 12.85 83.5
## 5 30.03 114.1 84.1 66.6 0.004 11.18 73.1
## 6 28.32 114.5 86.2 64.8 -0.001 11.27 70.7
##Simple Linear Regression Model
# Fit wins on all seven retained features. Use the formula interface together with `data =`
# instead of prefixing every term with cleaned.cbb.data$ -- mixing $ with a data argument
# is redundant and produces unwieldy coefficient names. teamWins is resolved from the
# calling environment since it is not a column of cleaned.cbb.data.
cbb.regr <- lm(teamWins ~ AdjEM + AdjO + AdjD + AdjT + Luck + AdjEM.1 + PtsPerGame, data = cleaned.cbb.data)
#save summary of initial model
summary.cbb.regr <- summary(cbb.regr)
summary.cbb.regr
##
## Call:
## lm(formula = teamWins ~ AdjEM + AdjO + AdjD + AdjT + Luck +
## AdjEM.1 + PtsPerGame, data = cleaned.cbb.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9842 -0.7579 -0.0630 0.7405 3.7859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.3200 10.9100 3.054 0.00295 **
## AdjEM 2.0372 3.2683 0.623 0.53461
## AdjO -1.4800 3.2557 -0.455 0.65047
## AdjD 1.2899 3.2740 0.394 0.69452
## AdjT -0.1551 0.1428 -1.086 0.28015
## Luck 38.8316 3.0144 12.882 < 2e-16 ***
## AdjEM.1 -0.5397 0.0667 -8.092 2.32e-12 ***
## PtsPerGame 0.1823 0.1223 1.490 0.13963
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.319 on 92 degrees of freedom
## Multiple R-squared: 0.9444, Adjusted R-squared: 0.9402
## F-statistic: 223.2 on 7 and 92 DF, p-value: < 2.2e-16
# In our summary we have a residual standard error of 1.319 which is pretty good, actually it is
# outstanding. Also our Multiple R-squared value = 0.9444 which suggests that 94% of our variance
# is explained, which is also outstanding. Even our Adjusted R-squared value is 0.9402 which
# still suggests that 94% of our variance is explained.
# Extract each slope estimate by NAME -- positional indexing (coefficients[2], [3], ...)
# silently grabs the wrong slope if the formula order ever changes.
AdjEM.beta.hat <- coef(cbb.regr)["AdjEM"]
AdjO.beta.hat <- coef(cbb.regr)["AdjO"]
AdjD.beta.hat <- coef(cbb.regr)["AdjD"]
AdjT.beta.hat <- coef(cbb.regr)["AdjT"]
Luck.beta.hat <- coef(cbb.regr)["Luck"]
AdjEM1.beta.hat <- coef(cbb.regr)["AdjEM.1"]
PtsPerGame.beta.hat <- coef(cbb.regr)["PtsPerGame"]
#Under the assumption of a confidence level of 95% we want to keep variables with p-values < 0.05
# Drop the intercept row with [-1, ] and test column 4 (the p-values)
index.sig.features <- which(coef(summary.cbb.regr)[-1, 4] < 0.05)
index.sig.features
## Luck AdjEM.1
## 5 6
# Only two features from the data are statistically significant, Luck and AdjEM.1
#Plots
#Residual vs Fitted Values from cbb.regr
plot(y = residuals(cbb.regr), x = fitted.values(cbb.regr))
abline(h = 0, col = 'red')
# In the above code we plotted the residuals from cbb.regr against the fitted.values from
# cbb.regr. We want constant variance, and the data points to be close to the red line, which
# would indicate that the estimators are close to the actual values. We see constant variance,
# but linearity is not clear from this plot.
#Next we will plot the response versus all predictor variables in the model.
# Each scatter plot below is drawn with plot() and abline() as SEPARATE statements; the
# original "plot(...) + abline(...)" added the two NULL return values and echoed a
# meaningless integer(0) after every figure. The simple regressions also use the formula
# interface with `data =` for consistency with the main model.
#teamWins vs AdjEM: In this plot we can see a linear trend that shows us the higher the AdjEM
# value is, the more wins a team may have. AdjEM stands for adjusted efficiency margin, which is
# the difference between AdjO and AdjD.
regr <- lm(teamWins ~ AdjEM, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$AdjEM, y = teamWins)
abline(regr, col = 'red')
#teamWins vs AdjO: Each datapoint represents a team, and we can observe that a team with a
# high AdjO (adjusted offense) rating, will likely have more wins. AdjO is adjusted offense
# and is a rating that suggests how many points a team will score per 100 possessions against
# an average NCAA D-1 defense. A high AdjO rating suggests a high-powered offensive team
regr <- lm(teamWins ~ AdjO, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$AdjO, y = teamWins)
abline(regr, col = 'red')
#teamWins vs AdjD: Here we see an inverse trend in relation to the two previous plots, which is
# a good thing. AdjD is an adjusted defense rating, which tells us per 100 possessions how many
# points will be scored against that team against an average NCAA D-1 offense. The lower the AdjD
# rating the better, since a lower value suggests a better defense.
regr <- lm(teamWins ~ AdjD, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$AdjD, y = teamWins)
abline(regr, col = 'red')
#teamWins vs AdjT: AdjT stands for adjusted tempo, and offers an estimate as to how many
# possessions a team would have in a 40-minute basketball game against a team that has
# an average NCAA-D1 Men's basketball tempo. We do not see any significant trends like we did
# in the other plots, however there is a slight downward trend suggesting that a team with lower
# tempo will win more games, probably due to the fact that they have an organized offense.
# A lower tempo suggests a team runs organized plays, which suggests that a team that runs
# organized plays will get better shots, than a team that has no organized offense. However,
# these are assumptions, and the trend is not significant enough visually.
regr <- lm(teamWins ~ AdjT, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$AdjT, y = teamWins)
abline(regr, col = 'red')
#teamWins vs Luck: Luck is a stat that attempts to explain the difference between a team's
# actual winning percentage and an expected win percentage based on a team's game by game
# efficiencies. Basically, a team involved in a lot of close games is not expected to win all
# of them, those that do are considered lucky.
# The line of regression suggests that a team that has more luck will have more wins, but upon
# observing the data points, I do not see a trend strong enough to suggest that. There are teams
# with more wins but less luck than other teams that have more luck, and less wins. I assume
# that the teams with the most wins, are just better teams which was indicated by
# some of the above plots. Although luck certainly does play a factor into the amount of wins
# a team will have.
regr <- lm(teamWins ~ Luck, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$Luck, y = teamWins)
abline(regr, col = 'red')
# teamWins vs AdjEM.1: AdjEM.1 is the adjusted efficiency margin based on the team you are
# playing. Like the plain AdjEM, AdjEM.1 is found by the difference in OppO - OppD. It is
# a rating that tries to define the average strength of the opponents one team matches up
# against all season. Essentially this is a strength of schedule rating. The plot's line of
# regression suggests that there is a downward trend between the number of wins by a team, and
# the AdjEM.1 rating. This is intuitive, you will not win as many games if you are playing
# better teams. By observing the datapoints alone, you see a slight trend in that direction.
regr <- lm(teamWins ~ AdjEM.1, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$AdjEM.1, y = teamWins)
abline(regr, col = 'red')
#teamWins vs Points Per Game: Intuitively one would think that the more points a team scores,
# the more wins a team would have. The data slightly suggests that, but there are teams that
# score a lot of points, but play no defense and give up more points than they score. This trend
# isn't observed for every team, but I think it is a fair assumption.
regr <- lm(teamWins ~ PtsPerGame, data = cleaned.cbb.data)
plot(x = cleaned.cbb.data$PtsPerGame, y = teamWins)
abline(regr, col = 'red')
#Normality and Non-Constant Variance
# First, we will plot the residuals vs the fitted values from the simple linear regression
# model, cbb.regr. We are looking at information suggested by the plot
# Observations from code below: In the first plot between the residuals from our model
# cbb.regr, and its fitted values we observe slight heteroscedasticity (non-constant variance),
# and slight linearity with some outliers as well. In the following QQ-Plot the data points
# are centered on the line of regression, except on the ends which we see some slight quadratic
# behavior, this indicates normality. We may perform a transformation on y, to try to fit the
# data to line in a more efficient way. Overall I am fairly happy with these plots.
# Residuals vs Fitted Values (plot() and abline() as separate statements -- the original
# plot(...) + abline(...) echoed a meaningless integer(0))
plot(y = residuals(cbb.regr), x = fitted.values(cbb.regr), xlab = "Fitted Values", ylab = "Residuals", main = "CBB Regr: Residuals vs Fitted Values")
abline(h = 0, col = 'red')
#QQ-plot
# The original refit an identical model here; cbb.regr is already this exact fit, so its
# residuals are used directly.
qqnorm(residuals(cbb.regr), ylab = "Residuals", main = "Residuals vs. Theoretical Quantities")
qqline(residuals(cbb.regr))
#Histogram: One main takeaway from the histogram is that we have the expected bell shape.
# This indicates normality in our residuals which means that our least squares estimates
# will be "BLUE" (Best Linear Unbiased Estimators)
hist(x = residuals(cbb.regr), xlab = "Residuals", ylab = "Frequency", main = "Histogram of Residuals")
#BoxCox : Transformation on the response
# NOTE(review): boxcox() comes from the MASS package -- presumably loaded in an earlier
# chunk; confirm. Spell out TRUE rather than the reassignable abbreviation T.
boxCoxPlot <- boxcox(cbb.regr, plotit = TRUE, lambda = seq(0, 3, by = 0.5))
# We want the peak of the solid black curve as our lambda which is done in the code below
lambda <- boxCoxPlot$x[which.max(boxCoxPlot$y)]
lambda
## [1] 1.121212
# This is the optimal transformation on our response, teamWins. We will take teamWins to the
# power of lambda, and then observe the plots again.
# Refit the model with a boxcox transformation on y
boxcox.regr <- lm(teamWins^lambda ~ AdjEM + AdjO + AdjD + AdjT + Luck + AdjEM.1+PtsPerGame, data = cleaned.cbb.data)
summary(boxcox.regr)
##
## Call:
## lm(formula = teamWins^lambda ~ AdjEM + AdjO + AdjD + AdjT + Luck +
## AdjEM.1 + PtsPerGame, data = cleaned.cbb.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2709 -1.2954 -0.0132 1.1403 6.1377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9290 17.7950 2.806 0.00613 **
## AdjEM 3.3719 5.3309 0.633 0.52861
## AdjO -2.4520 5.3102 -0.462 0.64535
## AdjD 2.1471 5.3402 0.402 0.68858
## AdjT -0.2473 0.2329 -1.062 0.29117
## Luck 63.3160 4.9167 12.878 < 2e-16 ***
## AdjEM.1 -0.8815 0.1088 -8.103 2.2e-12 ***
## PtsPerGame 0.2922 0.1996 1.464 0.14658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.151 on 92 degrees of freedom
## Multiple R-squared: 0.9447, Adjusted R-squared: 0.9405
## F-statistic: 224.7 on 7 and 92 DF, p-value: < 2.2e-16
# Standard lm diagnostic panel (4 plots) for the transformed model
plot(boxcox.regr)
#New plots with the transformed model
#Residuals Vs Fitted Values for boxcox.regr: In this plot we see constant variance, and linearity because the data points look to be equal on both sides of the line where h = 0. This is an excellent plot.
# plot() and abline() as separate statements (plot(...) + abline(...) echoed integer(0))
plot(x = fitted.values(boxcox.regr), y = residuals(boxcox.regr), xlab = "Fitted Values", ylab = "Residuals", main = "BoxCox.regr: Residuals vs Fitted Values")
abline(h = 0, col = 'red')
#QQ-Plot: In this plot the data looks like it is a better fit to the line, however there is still some quadratic behavior on the both ends, and maybe even outliers. The QQ-plot for the most part indicated normality.
qqnorm(residuals(boxcox.regr), ylab = "Residuals", main = "Residuals vs. Theoretical Quantities")
qqline(residuals(boxcox.regr))
#Histogram: We still have a slight bell like shape in the histogram, which suggests constant variance, or equally distributed variance, and normality which allows us to assume that our least-squares estimators are BLUE.
hist(x = residuals(boxcox.regr), xlab = "Residuals", ylab = "Frequency", main = "Histogram of Residuals")
#More formal test for normality on the transformed model using a shapiro-wilk test. In a Shapiro-Wilk test there is a null hypothesis: the residuals are normal, and an alternative hypothesis: the residuals are not normal. If one of these hypothesis is true, then you can assume that the data is the same as the residuals in regards to being normal or not. The results of the Shapiro-Wilk test on boxcox.regr returned a p-value = 0.2757 which is greater than 0.05, and allows us to Fail To Reject the null hypothesis, suggesting our residuals and data are from a normal distribution.
shapiro.test(residuals(boxcox.regr))
##
## Shapiro-Wilk normality test
##
## data: residuals(boxcox.regr)
## W = 0.98416, p-value = 0.2757
#Since we are trying to predict the number of wins a team will have in a season based on the features of our model, it is important that we take our response to the power of the exact lambda value. We want to be as precise as possible.
# Shapiro-Wilk Test for Normaility: A Shapiro Wilk test is basically a KS-Test, however the Shapiro-Wilk test is specifically for testing normality.
# Side Note: I decided to perform the shapiro-wilk test on the old regressions model that does not have a response taken to the power of lambda. I was just curious how normal our residuals and data would be even without the transformation.
#Null Hypothesis: The residuals are normal -- We will not reject if p-value > 0.05
#Alternative Hypothesis: The residuals are not normal -- reject if p-value is small
shapiro.test(residuals(cbb.regr))
##
## Shapiro-Wilk normality test
##
## data: residuals(cbb.regr)
## W = 0.98499, p-value = 0.3173
# Test Results: Since we have a p-value = 0.3173 we will FAIL TO REJECT the null hypothesis
# and make the assumption that are residuals are normal as well as the data.
# Leverages: hatvalues() returns the diagonal of the hat matrix, one value per observation.
# Provides 100 values that should sum to the number of parameters in our model
hatv <- hatvalues(boxcox.regr)
head(hatv)
## 1 2 3 4 5 6
## 0.19511454 0.16439919 0.08408603 0.14864003 0.11410702 0.09456489
#The sum of all the hat values should be equal to p (8), which is correct
sum(hatv)
## [1] 8
# Next we would like to identify unusually large values of leverage by using a half-normal plot
# In this plot we find that our two largest leverage points are Virginia, and VCU.
# We also included a QQ-plot of the standardized residuals, and expect the points to follow the line y = x which it does, so normality is again assumed.
# NOTE(review): halfnorm() comes from the 'faraway' package -- presumably loaded in an
# earlier chunk; confirm. Also, labs = teamNames assumes teamNames is a plain character
# vector; if it is still the LIST produced by str_extract_all() the labels may be wrong.
halfnorm(hatv, labs = teamNames, ylab = "Leverages")
{qqnorm(rstandard(boxcox.regr))
abline(0,1)}
#Store the leverage points by index, and in decreasing order, so the first index will hold the largest leverage point
# order() returns the permutation directly. The original sort(hatv, index = T, decreasing = T)$ix
# relied on PARTIAL matching of 'index' to sort.int's 'index.return' argument and on the
# reassignable abbreviation T for TRUE -- both are fragile.
leverage.index <- order(hatv, decreasing = TRUE)
hatv[leverage.index]
## 1 42 87 2 4 18
## 0.19511454 0.18184442 0.17398613 0.16439919 0.14864003 0.13709931
## 92 99 93 7 73 9
## 0.13693966 0.13633365 0.13532076 0.13218257 0.13026946 0.12991742
## 74 55 76 95 5 45
## 0.12669370 0.12658110 0.12223602 0.12149501 0.11410702 0.11303101
## 22 51 20 48 16 37
## 0.11012557 0.10892914 0.10823802 0.10502385 0.10413142 0.10377519
## 53 46 100 56 10 26
## 0.10329677 0.10178635 0.09985535 0.09952572 0.09818518 0.09790652
## 58 8 6 30 49 80
## 0.09679063 0.09463741 0.09456489 0.08927713 0.08913336 0.08845477
## 17 59 36 3 82 83
## 0.08634948 0.08619267 0.08567589 0.08408603 0.08394075 0.08098979
## 28 97 84 43 85 62
## 0.07875421 0.07662834 0.07480241 0.07448681 0.07295655 0.07277215
## 27 70 86 19 31 15
## 0.07154505 0.07087184 0.07067361 0.06981480 0.06910631 0.06872259
## 81 23 32 77 72 79
## 0.06803570 0.06769698 0.06760490 0.06697447 0.06692252 0.06570001
## 75 98 67 12 25 11
## 0.06384261 0.06275743 0.06259772 0.06145986 0.05930508 0.05926318
## 96 88 13 39 94 44
## 0.05762723 0.05742461 0.05641607 0.05633869 0.05628033 0.05604410
## 35 71 78 21 47 89
## 0.05533745 0.05517878 0.05432961 0.05352895 0.05333148 0.05320818
## 57 63 66 14 29 40
## 0.05214173 0.05213302 0.05007894 0.04955923 0.04838689 0.04710748
## 52 54 68 69 90 91
## 0.04576363 0.04483738 0.04458759 0.04264559 0.04102873 0.04075486
## 65 64 61 24 34 38
## 0.03910588 0.03856627 0.03347807 0.03196818 0.03195382 0.03118506
## 41 60 50 33
## 0.03116552 0.02750213 0.02257451 0.01807197
# By observing hatv[leverage.index] I find that the first 4 leverage points are much larger than the rest so I will save those leverage points in a vector, they could be used in scaling residuals. to get the actual team names that are the leverage points call the indexes stored in points on teamNames
leverage.points <- hatv[leverage.index[1:4]]
#Outliers: A point that does not fit the current model well. It is important to be aware of outliers because it enables us to differentiate between an unusual observation, and residuals that are large but not unusual
# Step 1) Compute studentized residuals for our model
stud <- rstudent(boxcox.regr)
stud[which.max(abs(stud))]
## 2
## -3.363396
# We have the largest residual at index 2 = -3.363396 (Gonzaga)
# will store the 10 largest outliers, ordered by absolute studentized residual.
# order() replaces the original sort(..., index = T, ...)$ix, which depended on partial
# argument matching ('index' -> 'index.return') and the T abbreviation.
index.of.stud.residuals <- order(abs(stud), decreasing = TRUE)
range(rstudent(boxcox.regr))
## [1] -3.363396 3.223380
# To determine if this value is in fact an outlier we must compute the Bonferroni critical value, and also check if it is also an influential point
#Influential Points: a point whose removal would cause a large change in fit. An influential point may or may not be an outlier, and it may not have leverage. An influential point will usually have one of these characteristics
#strategy: Use halfnorm with cooks distance
cook <- cooks.distance(boxcox.regr)
halfnorm(cook, 3, labs = teamNames, ylab = "Cook's Distances")
#Store the 10 most influential points
cook.index <- order(cook, decreasing = TRUE)
influential.points <- cook.index[1:10]
# This plot identified our three most influential points
# We will now exclude the largest one (Gonzaga), and see how the fit changes
cook.regr <- lm(teamWins ~ AdjEM + AdjO + AdjD + AdjT + Luck + AdjEM.1 + PtsPerGame, data = cleaned.cbb.data, subset = (cook < max(cook)))
# Summary of the refit that excludes the single largest Cook's-distance observation
summary(cook.regr)
##
## Call:
## lm(formula = teamWins ~ AdjEM + AdjO + AdjD + AdjT + Luck + AdjEM.1 +
## PtsPerGame, data = cleaned.cbb.data, subset = (cook < max(cook)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8404 -0.7595 -0.0349 0.6277 3.7127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.3410213 10.3017551 3.139 0.00228 **
## AdjEM 0.5728857 3.1132090 0.184 0.85441
## AdjO 0.0008555 3.1019935 0.000 0.99978
## AdjD -0.2004091 3.1195288 -0.064 0.94892
## AdjT -0.1513160 0.1347802 -1.123 0.26452
## Luck 37.7302592 2.8626400 13.180 < 2e-16 ***
## AdjEM.1 -0.5526155 0.0630615 -8.763 9.88e-14 ***
## PtsPerGame 0.2025169 0.1156239 1.752 0.08323 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.245 on 91 degrees of freedom
## Multiple R-squared: 0.9492, Adjusted R-squared: 0.9453
## F-statistic: 243.1 on 7 and 91 DF, p-value: < 2.2e-16
# Full transformed model again, for a side-by-side comparison with cook.regr above
summary(boxcox.regr)
##
## Call:
## lm(formula = teamWins^lambda ~ AdjEM + AdjO + AdjD + AdjT + Luck +
## AdjEM.1 + PtsPerGame, data = cleaned.cbb.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2709 -1.2954 -0.0132 1.1403 6.1377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9290 17.7950 2.806 0.00613 **
## AdjEM 3.3719 5.3309 0.633 0.52861
## AdjO -2.4520 5.3102 -0.462 0.64535
## AdjD 2.1471 5.3402 0.402 0.68858
## AdjT -0.2473 0.2329 -1.062 0.29117
## Luck 63.3160 4.9167 12.878 < 2e-16 ***
## AdjEM.1 -0.8815 0.1088 -8.103 2.2e-12 ***
## PtsPerGame 0.2922 0.1996 1.464 0.14658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.151 on 92 degrees of freedom
## Multiple R-squared: 0.9447, Adjusted R-squared: 0.9405
## F-statistic: 224.7 on 7 and 92 DF, p-value: < 2.2e-16
# NOTE(review): cook.regr models untransformed teamWins while boxcox.regr models
# teamWins^lambda, so their coefficient magnitudes and residual scales are not directly
# comparable -- keep that in mind when reading the comparison below.
# When comparing the model cook.regr which excludes the largest influential point, to the summary of the full model we see that the range of the residuals became more compact, AdjEM shrunk by 1/6, AdjO grew exponentially, AdjD decreased by 1/10, AdjT became smaller, Luck shrunk by roughly half, AdjEM.1 shrunk slightly, and PtsPerGame shrunk slightly. Not the actual values but the coefficients associated with each feature.
# dfbeta() columns are named after the model coefficients; index by NAME. The original
# indexed by position and was off by one throughout: column 1 is the Intercept, so [,2]
# (labeled "Change in AdjO") was actually AdjEM, [,1] (labeled "Change in AdjEM") was the
# Intercept, and every subsequent plot was similarly mislabeled.
# We can plot the change in estimate for AdjO
{plot(dfbeta(boxcox.regr)[, "AdjO"], ylab = "Change in AdjO")
abline(h = 0)}
#Plot change in AdjEM
{plot(dfbeta(boxcox.regr)[, "AdjEM"], ylab = "Change in AdjEM")
abline(h = 0)}
#plot change in AdjD
{plot(dfbeta(boxcox.regr)[, "AdjD"], ylab = "Change in AdjD")
abline(h = 0)}
#plot change in AdjT
{plot(dfbeta(boxcox.regr)[, "AdjT"], ylab = "Change in AdjT")
abline(h = 0)}
#Plot change in Luck
{plot(dfbeta(boxcox.regr)[, "Luck"], ylab = "Change in Luck")
abline(h = 0)}
#Plot change in AdjEM.1
{plot(dfbeta(boxcox.regr)[, "AdjEM.1"], ylab = "Change in AdjEM.1")
abline(h = 0)}
#Plot change in PtsPerGame
{plot(dfbeta(boxcox.regr)[, "PtsPerGame"], ylab = "Change in PtsPerGame")
abline(h = 0)}
#Since influential points can be both an outlier, and a leverage point which may change the fit of the model significantly we use cook's distance to look more closely at these identified points.
#Leverage Point Index: 1, 42, 87, 2
#Influential Points index: 2, 99, 45, 73, 51, 42, 11, 18, 93, 58
#Outliers Index: 2, 99, 11, 45, 77, 51, 96, 31, 47, 73
#summary of full model
summary(boxcox.regr)
##
## Call:
## lm(formula = teamWins^lambda ~ AdjEM + AdjO + AdjD + AdjT + Luck +
## AdjEM.1 + PtsPerGame, data = cleaned.cbb.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2709 -1.2954 -0.0132 1.1403 6.1377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9290 17.7950 2.806 0.00613 **
## AdjEM 3.3719 5.3309 0.633 0.52861
## AdjO -2.4520 5.3102 -0.462 0.64535
## AdjD 2.1471 5.3402 0.402 0.68858
## AdjT -0.2473 0.2329 -1.062 0.29117
## Luck 63.3160 4.9167 12.878 < 2e-16 ***
## AdjEM.1 -0.8815 0.1088 -8.103 2.2e-12 ***
## PtsPerGame 0.2922 0.1996 1.464 0.14658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.151 on 92 degrees of freedom
## Multiple R-squared: 0.9447, Adjusted R-squared: 0.9405
## F-statistic: 224.7 on 7 and 92 DF, p-value: < 2.2e-16
# NOTE(review): model.cook.1 is an exact re-fit of cook.regr from earlier (same formula,
# same subset); it exists only so the two Cook's-distance variants can be compared by name.
model.cook.1 <- lm(teamWins~AdjEM+AdjO+AdjD+AdjT+Luck+AdjEM.1+PtsPerGame, data = cleaned.cbb.data, subset = (cook < max(cook)))
summary(model.cook.1)
##
## Call:
## lm(formula = teamWins ~ AdjEM + AdjO + AdjD + AdjT + Luck + AdjEM.1 +
## PtsPerGame, data = cleaned.cbb.data, subset = (cook < max(cook)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8404 -0.7595 -0.0349 0.6277 3.7127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.3410213 10.3017551 3.139 0.00228 **
## AdjEM 0.5728857 3.1132090 0.184 0.85441
## AdjO 0.0008555 3.1019935 0.000 0.99978
## AdjD -0.2004091 3.1195288 -0.064 0.94892
## AdjT -0.1513160 0.1347802 -1.123 0.26452
## Luck 37.7302592 2.8626400 13.180 < 2e-16 ***
## AdjEM.1 -0.5526155 0.0630615 -8.763 9.88e-14 ***
## PtsPerGame 0.2025169 0.1156239 1.752 0.08323 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.245 on 91 degrees of freedom
## Multiple R-squared: 0.9492, Adjusted R-squared: 0.9453
## F-statistic: 243.1 on 7 and 91 DF, p-value: < 2.2e-16
model.cook.2 <- lm(teamWins~AdjEM+AdjO+AdjD+AdjT+Luck+AdjEM.1+PtsPerGame, data = cleaned.cbb.data, subset = (cook < head(max(cook))))
summary(model.cook.2)
##
## Call:
## lm(formula = teamWins ~ AdjEM + AdjO + AdjD + AdjT + Luck + AdjEM.1 +
## PtsPerGame, data = cleaned.cbb.data, subset = (cook < head(max(cook))))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8404 -0.7595 -0.0349 0.6277 3.7127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.3410213 10.3017551 3.139 0.00228 **
## AdjEM 0.5728857 3.1132090 0.184 0.85441
## AdjO 0.0008555 3.1019935 0.000 0.99978
## AdjD -0.2004091 3.1195288 -0.064 0.94892
## AdjT -0.1513160 0.1347802 -1.123 0.26452
## Luck 37.7302592 2.8626400 13.180 < 2e-16 ***
## AdjEM.1 -0.5526155 0.0630615 -8.763 9.88e-14 ***
## PtsPerGame 0.2025169 0.1156239 1.752 0.08323 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.245 on 91 degrees of freedom
## Multiple R-squared: 0.9492, Adjusted R-squared: 0.9453
## F-statistic: 243.1 on 7 and 91 DF, p-value: < 2.2e-16
#Nothing changed between the two models: head(max(cook)) equals max(cook) (head of a length-1 value is a no-op), so both subsets exclude exactly the same observation.
#LASSO:
# BUG FIX: the LASSO below uses glmnet()/cv.glmnet(), which come from the
# glmnet package — it was never loaded; only lars was, and lars is never
# actually called. Load glmnet, and use library() rather than require() so a
# missing package fails loudly instead of returning FALSE.
library(lars)   # kept for backward compatibility; no lars function is used below
library(glmnet)
# Build the full design matrix (all ten predictors) and drop the intercept column.
regr.matrix <- model.matrix(teamWins~AdjEM+AdjO+AdjD+AdjT+Luck+AdjEM.1+OppO+OppD+AdjEM.2+PtsPerGame, data = initial.cbb.data)[,-1]
summary(regr.matrix)
## AdjEM AdjO AdjD AdjT
## Min. : 6.450 Min. :102.6 Min. : 84.10 Min. :59.40
## 1st Qu.: 8.988 1st Qu.:109.0 1st Qu.: 94.47 1st Qu.:65.88
## Median :13.905 Median :111.1 Median : 97.40 Median :67.50
## Mean :14.778 Mean :111.8 Mean : 97.05 Mean :67.51
## 3rd Qu.:18.503 3rd Qu.:113.9 3rd Qu.: 99.62 3rd Qu.:68.95
## Max. :34.220 Max. :124.5 Max. :109.00 Max. :74.30
## Luck AdjEM.1 OppO OppD
## Min. :-0.11100 Min. :-5.840 Min. :100.7 Min. : 96.90
## 1st Qu.:-0.02825 1st Qu.: 2.172 1st Qu.:105.6 1st Qu.: 98.95
## Median : 0.00550 Median : 7.605 Median :108.2 Median :100.80
## Mean : 0.00367 Mean : 6.204 Mean :107.5 Mean :101.28
## 3rd Qu.: 0.03925 3rd Qu.:10.652 3rd Qu.:109.5 3rd Qu.:103.25
## Max. : 0.16700 Max. :14.130 Max. :111.8 Max. :108.30
## AdjEM.2 PtsPerGame
## Min. :-12.3500 Min. :65.80
## 1st Qu.: -3.2250 1st Qu.:71.47
## Median : 0.0500 Median :74.00
## Mean : -0.2188 Mean :75.05
## 3rd Qu.: 3.0875 3rd Qu.:78.15
## Max. : 10.3600 Max. :88.80
# Create a grid of 100 candidate lambda values from 10^10 down to 10^-2.
grid.lambda <- 10^seq(10, -2, length = 100)
# Response: total wins for the season.
y <- initial.cbb.data$Wins
# Fit the LASSO (alpha = 1) over the full lambda grid on all the data.
lasso.regr <- glmnet(regr.matrix, y, alpha = 1, lambda = grid.lambda)
# Divide the data into a training half and a testing half.
# NOTE(review): there is no set.seed() before this sample() call, so the
# train/test split is not reproducible; the set.seed(1) further down applies
# only to the cross-validation folds, not to this draw.
training.data <- sample(seq_len(nrow(regr.matrix)), nrow(regr.matrix) / 2)
testing.data <- (-training.data)
yTraining <- y[training.data]
yTesting <- y[testing.data]
# Fit the LASSO on the training rows only.
# BUG FIX: the argument was misspelled "lamda", so the lambda grid was silently
# dropped and glmnet fell back to its internally generated lambda sequence.
lasso.regr.train <- glmnet(regr.matrix[training.data, ], yTraining, alpha = 1, lambda = grid.lambda)
#In order to find the best value for lambda we have to perform cross-validations
# Cross-validate over lambda (seed fixed so the CV folds are reproducible).
set.seed(1)
cv.out <- cv.glmnet(regr.matrix[training.data, ], yTraining, alpha = 1)
plot(cv.out)
# Save the best lambda: lambda.min is the lambda that MINIMIZES the
# cross-validated error along the curve (it is not the smallest lambda).
# Re-plot cv.out with a green vertical line marking the chosen value.
best.lambda <- cv.out$lambda.min
{plot(cv.out)
abline(v = log(best.lambda), col = 'green', lwd = 2)}
# Compute the MSPE on the held-out TEST set: predict with the training-set fit
# at the best lambda and compare against yTesting.
lasso.predict <- predict(lasso.regr.train, s = best.lambda, newx = regr.matrix[testing.data,])
lasso.mspe <- mean((lasso.predict- yTesting)^2)
lasso.mspe # transcript below shows 2.309862; the 1.563869 noted previously came from a different (unseeded) train/test split
## [1] 2.309862
# Refit the LASSO on the full data at the cross-validated best lambda.
cbb.final <- glmnet(regr.matrix, y, alpha = 1, lambda = best.lambda)
# Extract all coefficients (intercept + predictors) as a named vector.
# Using the first (only) column instead of the hard-coded 1:11 keeps this
# correct if predictors are ever added to or dropped from the design matrix.
lasso.coefficient <- coef(cbb.final)[, 1]
lasso.coefficient
# Predict wins for the held-out test rows and bind them beside the actual wins
# (column 1 = actual wins, column 2 = predicted wins).
predict.lasso <- predict(cbb.final, s = best.lambda, newx = regr.matrix[testing.data,])
cbb.lasso <- cbind(y[testing.data],predict.lasso)
summary(cbb.lasso)
## V1 1
## Min. :14.00 Min. :15.12
## 1st Qu.:20.00 1st Qu.:20.13
## Median :23.50 Median :23.29
## Mean :23.54 Mean :23.39
## 3rd Qu.:27.00 3rd Qu.:26.28
## Max. :33.00 Max. :36.80
cbb.lasso
## 1
## 2 33 36.80400
## 5 31 31.13024
## 6 30 29.47081
## 8 30 30.18333
## 9 26 26.24777
## 11 30 26.65750
## 12 33 32.13507
## 15 23 22.73400
## 17 26 26.02132
## 21 23 22.66467
## 22 32 30.85766
## 23 20 20.41265
## 25 21 19.28163
## 26 20 19.44854
## 27 29 29.60866
## 28 25 23.74224
## 30 26 24.99928
## 32 20 21.13589
## 34 24 25.20882
## 35 20 20.12816
## 38 28 27.64009
## 39 20 21.03572
## 43 14 15.38249
## 45 29 26.29103
## 48 27 26.12831
## 49 27 28.49906
## 50 20 20.54970
## 52 19 18.36356
## 53 30 29.43568
## 56 22 21.11865
## 57 23 23.25626
## 58 29 26.47738
## 60 20 20.23823
## 61 25 25.63166
## 64 18 18.72578
## 68 15 16.68826
## 71 23 23.77657
## 78 14 15.63376
## 80 24 23.52081
## 81 18 18.30096
## 82 16 16.21020
## 85 24 23.20228
## 88 21 21.17334
## 89 23 23.65601
## 90 21 19.53678
## 91 14 15.11504
## 92 24 23.76384
## 94 17 17.65340
## 96 26 23.33327
## 99 24 20.13987
#In the data frame cbb.lasso the row names are the original row indices of the teams drawn into testing.data (not the team names themselves); column 1 is the actual wins from the 2018-2019 season, and column 2 is our predicted number of wins for each team.
Write Up:
Statement of question that you want to answer: Answer: Can we use linear regression to accurately predict the wins for a college basketball team in a season?
What data will you analyze to answer your question? Identify at least one data source: Answer: For this project I used a data set from Kenpom.com and I added a feature for total wins, and points per game. I used the 2018-2019 Men’s College Basketball Season dataset from last year. There were 100 observations and 11 variables to start from.
Data Cleaning: First I had to add two columns to the dataset to make predicting how many wins a team will achieve in a season more viable. I added a column for total wins for the 2018 season, and points per game for the 2018 season as well. There were no NA’s in the data, but in the above section, ‘pre-processing and cleaning data’ we still went through the motions and looked for NA’s, however when we summed them up we got a value equal to 0. We used excel to create the initial csv file, but excel added on a special character to the end of each Team’s name. We had to work through that and strip the special excel character from the team name. After that we calculated the coefficient of variation, which then allowed us to check the variance of each column and eliminate any features that had little to no variance. Initially, I wanted to remove any features that had a variance of less than 0.05, however after looking at the summary of the coefficient of variation I saw that 25% of our columns had a variance of less than 0.03. We changed our mind, and decided to eliminate any column that had a variance that fell equal to or less than the first quartile. Pretty much any column that had a variance less than 0.03, we removed. We ended up removing three variables from the dataset when we created our first model. The features were Adjusted Offense, Adjusted Defense, and AdjEM.2 (which measures strength of schedule). This was pretty intuitive to me, because one of the most statistically significant features was AdjEM, and that accounts for Adjusted Offense and Adjusted Defense within its formula.
Exploratory Analysis: During this phase of the project we had a regression model with 8 variables out of the 11 we started with. After running the summary function on our first model, we had a Multiple R-Squared value = 0.9444, which indicates that with our data we have the ability to explain for over 94% of the total variance within the data. We also had a Residual Standard error equal to 1.319, which made me very happy. Like I stated earlier I was very surprised to see how high the coefficient for Luck was. We calculated the coefficient for Luck equal to 38.8316. The next highest coefficient was 2.0372, so I was very surprised by that. Our dataset only had data for the top 100 teams from last season. I didn’t think luck would have been as important as it was because all the teams we were analyzing were among the elite college basketball teams last season. I thought that they would have been more skilled, and relied less on luck. As far as the dataset goes, we had 100 observations (teams), and 11 variables. We had to understand how each feature was going to help explain the data. The first variable is teamWins, which did not take into account losses. The next variable was AdjEM which is the total efficiency margin for a team, and is calculated by subtracting Adjusted Defense from Adjusted Offense. Adjusted Offense (AdjO) tells us how many points on average a team will score against an average Division 1 Men’s College basketball defense per 100 possessions. Adjusted defense (AdjD) tells us how many points an average offensive team will score per 100 possessions against a specific team. Adjusted tempo (AdjT) is an estimate of the possessions a team will have per 40 minutes, or per game. Luck is defined as if a team is involved in many close games, they are not expected to win all of them. 
If they do, that is considered “lucky.” Luck is actually calculated as a penalty, and the higher a team’s luck rating is the lower they will be rated in Kenpom’s ranking hierarchy. Adjusted Efficiency Margin.1 takes into account the opponent’s Adjusted offense, and Adjusted defense. Theoretically if the teams Team A is playing have high AdjEM.1 ratings, then the win expectancy for team A will go down, because they are playing better teams. It is pretty intuitive, but it was cool to have a coefficient that shows how much a team’s total wins will be affected by this rating. For every 1 unit increase of a team’s AdjEM.1 rating, their win expectancy falls by 1/2 of a win. I thought that was pretty interesting. The last variable is Points Per Game, which I initially thought would have a more significant effect than it did. In fact, I probably could have not added that column, it didn’t seem to be significant at all. From the data exploration we hypothesized that a team with a higher AdjEM will have a higher total of wins. To have a high AdjEM a team has to have a high Adjusted Offensive rating (meaning they score a lot), and a low Adjusted Defense rating (they do not get scored on a lot). That was a pretty intuitive hypothesis. Also a team with a high luck rating will have more wins.
c) Variable Selection: Initially we just used the coefficient of variation to select our variables. We looked at the p-values
for each variable, and wanted to remove any features that had a p-value greater than 0.05. However, upon
reading the summary of our first linear model, we had a residual standard error of 1.319 which is very good,
and a Multiple R-Squared value, and Adjusted R-squared value that told us we could explain over 94% of the
variance. I didn't want to touch that. Actually I excluded the Points Per Game variable, for fun, and
our Multiple R-Squared value grew, indicating that we should probably exclude that variable altogether.
However, I really did think that Points Per Game would be significant in determining a team's total
wins, and I decided to leave it in the model until we tried to evolve our model later on in the project. In our model
we ended up keeping the variables: AdjEM, AdjO, AdjD, AdjT, Luck, AdjEM.1, and Points per game. I was ok with those variables
because I thought it was intuitive that those variables stayed in the model.
d) Check the Assumptions of the Model: The first thing we did was create a plot that measured the residuals of the model vs the fitted values. In this kind of plot were hoping to see constant-variance (homoscedascity), and a linear trend with data
grouped around the 0-line. Luckily for us, we saw that right away. We didn't have to do anything to stabilize
the variance. We were pretty lucky in that regard, we had real good data. Right away, we were able to assume
linearity, and homoscedascity. Next, we plotted the response (teamWins) against each variable in our model to
look for any linear trends visually. We saw that there was an upward linear trend between teamWins, and
AdjEM, as well as Adjusted offense. There was a negative trend when we plotted the response against Adjusted
defense, which made sense. The higher an Adjusted defensive rating is, the more points are being scored on
that team which implies less total wins. It's hard to win when you cannot stop anyone on defense. Next, we
wrote the code to create a QQ-plot, and histogram of the residuals. The QQ-plot grew our confidence that our
residuals were normal, because the points lined up against the qqline. Normality was assumed, but just to be safe
we ran a Shapiro-Wilk test for normality. The null hypothesis being that the residuals are normal, and the
alternative hypothesis being that they are not. The test returned a p-value greater than 0.05, which allowed us to
Fail To Reject the null hypothesis, and make the assumption that our data was in fact normal. The histogram
returned a nice bell-shape throughout the graph which, again, allowed us to assume our residuals were normal,
and allowed us to assume that our estimators are in fact BLUE.After completing the Shapiro-Wilk test, we ran
the boxcox function to find the best lambda in case we wanted to transform the response. We honestly didn't
need to, just because all of the other tests we performed reinforced the assumptions of our model, but since
we were going to be making predictions, I thought it would behoove us to receive an optimal lambda value, and
transform the response. We received a lambda value of 1.111111, which was close to 1. Again, since we were
going to be performing prediction, I thought it was best to transform the response by taking it to the power
of our lambda value.
e) Final Fit and interpretation: After we cleaned the data, explored the data, chose our variables, and check the assumptions of our model we were
ready to to do a final fit, and interpret the data. First we checked for leverage points. There were 4 that were separated from the rest. I stored those in a vector, and then we used a half-norm plot to visually show our leverage points. Next we looked for outliers, and I stored the ten highest outliers, or what may be outliers. I didn't want
to exclude them from our model right away, because there was a good possibility that those outliers were also
influential points, that would adjust the fit of the model if they were removed. After we identified our outliers
we would compare them against the Bonferroni critical value, that would indicate to us the likelihood of the possible
outliers actually being an outlier. Even then though we had to still check for influential points as well before we
decided to remove any observations from the data. To check for influential points we used Cook's distance, and a halfnorm plot again. Gonzaga was actually an influential point, and upon removing that data point or observation
from the dataset, our model changed greatly. I decided to leave that datapoint within the dataset. In the code above
I actually plotted the change in each variable after Gonzaga was removed. Then we created two models each with a
different subset, in an attempt to see how the data performed with the contraints. Actually both models did not
change at all even though, we had the two subsets. Every value stayed exactly the same, which doesn't seem right to
me, but I didn't want to ruin the model trying to do too much. Finally after performing all of that, we were ready
to move forward and start LASSO. We made a regression matrix that stored the full model, then we made a grid of
lambdas. After storing the response as y, we created our first LASSO regression model using glmnet.Then we separated
the data into a training set, and a test set. Then we fit the LASSO regression model onto the training data before
we performed cross-validation to obtain the best lambda value. Which from the plot is the minimum value along curve
unlike boxcox. We used our best lambda to calculate the MSPE of the model. Once we did that we were able to able
to fit the LASSO regression model, with the help of our best lambda, onto the test data. Lasso selected the
variables: AdjEM, AdjD, Luck, AdjEM.1, OppO, and PtsPerGame to use in predicting how many wins a team should win
during the season. This was very similar to the first model we made using the coefficients of variation to select
variables in the beginning. However, there were some differences. LASSO didn't use AdjO where we did, and LASSO selected OppO (per the coefficient table above, OppO is nonzero and OppD is zero) which we left out of our first model. We then made a data frame that used the test-data row indices as row names, actual wins for the first column, and the predicted amount of wins for the second column. I was actually astonished by how accurate the prediction values were in comparison to the actual values. It's pretty amazing, and
something I would like to explore more.